home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- */
-
- #ifndef lint
- static char *RCSid = "$Header: /tmp_mnt/net/quake/proj/wais/wais-8-b5/ir/RCS/irretrvl.c,v 1.30 92/05/10 14:43:59 jonathan Exp $";
- #endif
-
- /* Change log:
- * $Log: irretrvl.c,v $
- * Revision 1.30 92/05/10 14:43:59 jonathan
- *
- * Made a little safer on NULL docid's when parsing.
- *
- * Revision 1.29 92/05/06 17:31:26 jonathan
- * modified #if's for NeXT and Mach. Added S_ISDIR definition for them both.
- *
- * Revision 1.28 92/05/04 17:19:54 jonathan
- * Added test for parsing docids (if null, log error).
- *
- * Revision 1.27 92/04/28 16:56:08 morris
- * added boolean to serial engine
- *
- * Revision 1.26 92/04/01 17:09:46 jonathan
- * Added index_directory to check_for_legitimate_file to test if filename is
- * under default directory (for FTP-like retrieval).
- *
- *
- * Revision 1.25 92/03/18 08:54:41 jonathan
- * Removed databaseName argument from getData and getDocumentText. The
- * database name is now culled from the docid. Removed special cases for INFO
- * and Quest db's, as they should no longer be needed.
- *
- * Revision 1.24 92/02/18 14:04:49 jonathan
- * in check_for_legitimate_file: added INFO to the list of special case
- * retrievals from MAC's.
- *
- * Revision 1.23 92/02/18 11:53:45 jonathan
- * conditionalized use of tempnam for NeXT (doesn't exist, use tmpnam
- * instead). May be a BSD thing.
- *
- * Revision 1.22 92/02/17 12:38:52 jonathan
- * special case catalog in check_for_legitimate_file.
- *
- * Revision 1.21 92/02/16 18:04:52 jonathan
- * Demoted more WLOG_ERROR's to WLOG_WARNING's
- *
- * Revision 1.20 92/02/15 19:40:30 jonathan
- * Improved reporting of retrieval errors.
- *
- * Revision 1.19 92/02/15 18:58:38 jonathan
- * Changed most (but not all) waislog errors to warnings on retrieval.
- *
- * Revision 1.18 92/02/14 16:06:20 jonathan
- * Fixed text in error message for invalid docid (not in DB)
- *
- * Revision 1.17 92/02/14 15:24:08 jonathan
- * Made parseDocID public.
- *
- * Revision 1.16 92/02/12 13:29:35 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- /* retrieval part of the serial ir engine. if you are using a different
- storage system for the documents, replace this file.
-
- -brewster
-
- 10/91 added .Z file support from mlm@cs.brown.edu (Moises Lejter)
-
- to do:
- handle .Z files at a lower level.
-
- */
-
#include "irretrvl.h"
#include "irfiles.h" /* for filename_table_ext */
#include <string.h>
#include <stdlib.h> /* for free and system */
#include "futil.h"
#include <ctype.h> /* for isspace */
#include "irext.h"
#include "irdirent.h"
#include <sys/stat.h>
-
/* Fallback definitions of S_ISDIR for platforms whose headers do not
   supply it (per the change log: NeXT and Mach).

   The file type is a multi-bit field inside the mode word, so the mode is
   masked with the type mask and compared against the directory type;
   testing the directory bit alone would also match other file types that
   share that bit (block devices, sockets). */

#ifdef Mach
#include <sys/inode.h>
#ifndef S_ISDIR
/* NOTE(review): assumes <sys/inode.h> provides IFMT alongside IFDIR, as the
   BSD header does -- confirm on the Mach target. */
#define S_ISDIR(f_mode) (((f_mode) & IFMT) == IFDIR)
#endif /* S_ISDIR */
#endif /* Mach */

#if (defined(NeXT) && !(defined(S_ISDIR)))
/* the previous definition expanded an undeclared name (fmode) instead of
   its own parameter */
#define S_ISDIR(f_mode) (((f_mode) & S_IFMT) == S_IFDIR)
#endif
-
- /*----------------------------------------------------------------------*/
-
-
- boolean
- parseDocID(doc,filename,start_character,end_character,errorCode)
- DocObj* doc;
- char* filename;
- long* start_character;
- long* end_character;
- long* errorCode;
- {
- DocID* theDocID = NULL;
- char* local_id = NULL;
- char* token = NULL;
- long i;
-
- if((theDocID = docIDFromAny(doc->DocumentID)) == NULL)
- return false;
-
- local_id = anyToString(GetLocalID(theDocID));
-
- freeDocID(theDocID);
-
- /* parse the doc id into start pos, end pos, and filename */
- /* first the start char */
- token = local_id;
- for (i = 0; local_id[i] != '\0' && isspace(local_id[i]) == false; i++)
- ;
- if (local_id[i] == '\0')
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "Attempt to retrieve data for bad doc-id: '%s'",local_id);
- *errorCode = GDT_BadDocID;
- s_free(local_id);
- return(false);
- }
- local_id[i] = '\0';
- sscanf(token,"%ld",start_character);
- /* now the second char */
- token = local_id + i + 1;
- for (++i; local_id[i] != '\0' && isspace(local_id[i]) == false; i++)
- ;
- if (local_id[i] == '\0')
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "Attempt to retrieve data for bad doc-id: '%s'",local_id);
- *errorCode = GDT_BadDocID;
- s_free(local_id);
- return(false);
- }
- local_id[i] = '\0';
- sscanf(token,"%ld",end_character);
- /* and finally the file name */
- strncpy(filename,local_id + i + 1,MAX_FILENAME_LEN);
- s_free(local_id);
- return(true);
- }
-
-
- /*----------------------------------------------------------------------*/
-
- /* this checks to make sure that the filename is a file
- within the database */
-
- static boolean check_for_legitimate_file
- _AP((char *filename, char* database_name, char* index_directory));
-
- static boolean check_for_legitimate_file(filename, database_name, index_directory)
- char *filename;
- char *database_name; /* full pathname of the database */
- char *index_directory;
- {
- struct stat sbuf;
-
- /* the help file and catalog file (the .src and .cat files) must be
- special cased because it is not in the filename table */
-
- /* caching is done in filename_in_filename_file for repeated requests
- for the same file, so it does not need to be repeated here. */
-
- if(NULL != strstr(filename, ".src")) /* let it pass */
- return(true);
-
- if(NULL != strstr(filename, ".cat")) /* let it pass */
- return(true);
-
- stat(filename, &sbuf);
- if(S_ISDIR(sbuf.st_mode)) {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "File: '%s' is a directory, and cannot be retrieved.",
- filename);
- return(false);
- }
- else {
- /* name of the file of the filetable for this db (eg /bar/foo.fn). confusing, no? */
- char filename_table_filename[MAX_FILE_NAME_LEN +1];
-
- pathname_directory(database_name, filename_table_filename);
- strncat(filename_table_filename, "/", MAX_FILE_NAME_LEN);
- strncat(filename_table_filename,
- database_file(pathname_name(database_name)),
- MAX_FILE_NAME_LEN);
- s_strncat(filename_table_filename, filename_table_ext, MAX_FILE_NAME_LEN,
- MAX_FILE_NAME_LEN);
- if(!filename_in_filename_file(filename, NULL, NULL, filename_table_filename)){
- /* we lose. this means either the db does not exist, or
- the file is not in that db. Log the bad news */
- if(index_directory == NULL)
- return true;
- else if (substrcmp(filename, index_directory))
- return true;
- waislog(WLOG_HIGH, WLOG_WARNING,
- "File: '%s' is not in DB '%s', and cannot be retrieved.",
- filename, filename_table_filename);
- return(false);
- }
- else{ /* everything is peachy */
- return(true);
- }
- }
- }
-
-
- /*----------------------------------------------------------------------*/
-
- WAISDocumentText* getData(doc, errorCode, index_directory)
- DocObj* doc;
- long* errorCode;
- char* index_directory;
- /* it isn't text, so we can just grab data */
- {
- FILE* file = NULL;
- char fileName[MAX_FILENAME_LEN + 1];
- char* dbname = NULL;
- WAISDocumentText* data = NULL;
- long start,end; /* position of the document in the file */
- long startByte,endByte,bytes,bytesRead; /* part of the doc that we want */
- char* buffer = NULL;
- any* bufAny = NULL;
- DocID *docid;
- #if (defined(NeXT) || defined(Mach))
- char tmpFileName[MAX_FILENAME_LEN+1];
- #else
- char *tmpFileName = NULL;
- #endif /* NeXT or Mach */
-
- /* we can only handle byte chunks here */
- if ((doc->ChunkCode == CT_byte) ||
- (doc->ChunkCode == CT_document)) {
- if (parseDocID(doc,fileName,&start,&end,errorCode) == false)
- {
- waislog(WLOG_HIGH, WLOG_WARNING, "can't parse docid");
- *errorCode = GDT_MissingDocID;
- return(NULL);
- }
-
- *errorCode = GDT_NoError;
-
- docid = docIDFromAny(doc->DocumentID);
- dbname = anyToString(GetDatabase(docid));
- freeDocID(docid);
-
- if(true == check_for_legitimate_file(fileName, dbname, index_directory)){
- file = s_fopen(fileName,"rb");
-
- if (file == NULL){
- if(probe_file_possibly_compressed(fileName)) {
- char buffer[ 2 * MAX_FILENAME_LEN + 10 ];
- #if (defined(NeXT) || defined(Mach))
- tmpnam(tmpFileName);
- #else
- tmpFileName = tempnam( "/tmp/", 0 );
- #endif /* NeXT or Mach */
- sprintf( buffer, "zcat %s.Z > %s", fileName, tmpFileName );
- system( buffer );
- file = s_fopen(tmpFileName,"rb");
- }
- }
- }
-
- if (file == NULL) {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "Attempt to retrieve data for missing doc-id: '%s'",
- fileName);
- *errorCode = GDT_MissingDocID;
- s_free(dbname);
- return(NULL);
- }
-
- if (doc->ChunkCode == CT_byte) {
- startByte = doc->ChunkStart.Pos + start;
- endByte = doc->ChunkEnd.Pos + start;
- }
- else {
- startByte = start;
- endByte = end;
- }
-
- waislog(WLOG_LOW, WLOG_RETRIEVE,
- "Retrieving DocID: %d %d %s, byte: %d %d, from database %s",
- start, end, fileName, startByte, endByte, dbname);
-
- s_free(dbname);
-
- if (endByte > end && end != 0) {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "retrieval beyond bounds of document %ld in file <%s>",
- endByte,fileName);
- *errorCode = GDT_BadRange;
- endByte = end;
- }
-
- /* get the bytes */
- if (fseek(file,startByte,SEEK_SET) != 0)
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "retrieval can't seek to %ld in file <%s>",startByte,
- fileName);
- *errorCode = GDT_BadRange;
- if (tmpFileName) unlink( tmpFileName );
- if (tmpFileName) unlink( tmpFileName );
- if (tmpFileName) unlink( tmpFileName );
- return(NULL);
- }
-
- bytes = endByte - startByte;
- buffer = (char*)s_malloc(bytes);
-
- bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
-
- if (bytesRead != bytes)
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "retrieval error in file <%s>",fileName);
- *errorCode = GDT_BadRange;
- if (bytesRead == 0)
- return(NULL);
- }
-
- bufAny = makeAny(bytesRead,buffer);
-
- data = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
-
- /* the any and the buffer are freed by freeWAISSearchResponse() */
- s_fclose(file);
- if (tmpFileName) unlink( tmpFileName );
-
- return(data);
- }
- else
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "search engine can only use whole documents or byte offsets for data lookup");
- *errorCode = GDT_UnsupportedChunkType;
- return(NULL);
- }
-
- }
-
- /*----------------------------------------------------------------------*/
-
- #define BUFSZ (size_t)5000
-
- WAISDocumentText* getDocumentText(doc, errorCode, index_directory)
- DocObj* doc;
- long* errorCode;
- char* index_directory;
- /* find the text for doc, get the sub part if any, finally construct and
- return a WAISDocumentText. If it can not find the document
- (or some other error) it returns NULL and sets errorCode.
- */
- {
- WAISDocumentText* text = NULL;
- FILE* file = NULL;
- char* dbname = NULL;
- char* buffer = NULL;
- any* bufAny = NULL;
- char filename[MAX_FILENAME_LEN + 1];
- long start_character;
- long end_character;
- register long i;
- long bytes,bytesRead;
- long startByte,endByte,byte,lines;
- #if (defined(NeXT) || defined(Mach))
- char tmpFileName[MAX_FILENAME_LEN+1];
- #else
- char *tmpFileName = NULL;
- #endif /* NeXT or Mach */
- DocID* theDocID = NULL;
- char* local_id = NULL;
-
- *errorCode = GDT_NoError;
-
- /* we can only handle line chunks for now */
- if (doc->ChunkCode != CT_line)
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "search engine can only use line offsets for now.");
- *errorCode = GDT_UnsupportedChunkType;
- return(NULL);
- }
-
- theDocID = docIDFromAny(doc->DocumentID);
- dbname = anyToString(GetDatabase(theDocID));
- local_id = anyToString(GetLocalID(theDocID));
- freeDocID(theDocID);
-
- if (parseDocID(doc,filename,&start_character,&end_character,errorCode) ==
- false) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Can't parse doc-id: '%s'", local_id);
- *errorCode = GDT_MissingDocID;
- s_free(dbname);
- s_free(local_id);
- return(NULL);
- }
-
- waislog(WLOG_LOW, WLOG_RETRIEVE,
- "Retrieving DocID: '%s', line range: %d %d, from database %s",
- local_id, doc->ChunkStart.Pos, doc->ChunkEnd.Pos,
- dbname);
- /* check the database */
- if(NULL == dbname){
- waislog(WLOG_HIGH, WLOG_WARNING,
- "Missing database for doc-id: '%s'", local_id);
- *errorCode = GDT_MissingDatabase;
- s_free(local_id);
- return(NULL);
- }
-
- if(check_for_legitimate_file(filename, dbname, index_directory) == false){
- waislog(WLOG_HIGH, WLOG_WARNING,
- "doc-id: '%s' not in database '%s'", local_id,dbname);
- *errorCode = GDT_MissingDocID;
- s_free(dbname);
- s_free(local_id);
- return(NULL);
- }
-
- s_free(dbname);
-
- file = s_fopen(filename,"r");
- if (file == NULL)
- if(probe_file_possibly_compressed(filename)) {
- char buffer[ 2 * MAX_FILENAME_LEN + 10 ];
- #if (defined(NeXT) || defined(Mach))
- tmpnam(tmpFileName);
- #else
- tmpFileName = tempnam( "/tmp/", 0 );
- #endif /* NeXT or Mach */
- sprintf( buffer, "zcat %s.Z > %s", filename, tmpFileName );
- system( buffer );
- file = s_fopen(tmpFileName,"r");
- }
- if (file == NULL) {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "Attempt to retrieve text for bad doc-id: '%s'", local_id);
- *errorCode = GDT_MissingDocID;
- s_free(local_id);
- return(NULL);
- }
-
- if(0 != fseek(file, start_character, SEEK_SET))
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- " error on attempt to seek into file for doc-id: '%s'", local_id);
- s_free(local_id);
- *errorCode = GDT_BadRange;
- return(NULL);
- }
- /* find the start byte */
- buffer = (char*)s_malloc(BUFSZ);
- lines = byte = 0;
- while (lines < doc->ChunkStart.Pos)
- { /* search a buffer full */
- bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file);
- for (i = 0; i < bytesRead && lines < doc->ChunkStart.Pos; i++, byte++)
- { if (buffer[i] == '\n' || buffer[i] == '\r')
- /* \r should not happen because we are reading the file in text
- mode */
- lines++;
- }
- if (bytesRead == 0) /* cheasy handling files that don't end with nl */
- lines++;
- }
- startByte = byte;
-
- beFriendly();
-
- /* find the end byte */ /* this could be done while getting the bytes XXX */
- /* search starting form the start pos */
- if (fseek(file,startByte + start_character,SEEK_SET) != 0)
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "retrieval can't seek to %ld in file <%s>",
- startByte,filename);
-
- *errorCode = GDT_BadRange;
- if (tmpFileName) unlink( tmpFileName );
- s_free(local_id);
- return(NULL);
- }
-
- beFriendly();
-
- while (lines < doc->ChunkEnd.Pos)
- { /* search a buffer full */
- bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file);
- for (i = 0; i < bytesRead && lines < doc->ChunkEnd.Pos; i++, byte++)
- { if (buffer[i] == '\n' || buffer[i] == '\r')
- /* \r should not happen, we are reading the file in text mode */
- lines++;
- }
- if (bytesRead == 0) /* cheasy handling of files that don't end with nl */
- lines++;
- }
- endByte = byte;
-
- beFriendly();
-
- s_free(buffer);
-
- /* get the bytes */
- if (fseek(file,startByte + start_character,SEEK_SET) != 0)
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "retrieval can't seek to %ld in file <%s>",startByte,
- filename);
-
- *errorCode = GDT_BadRange;
- if (tmpFileName) unlink( tmpFileName );
- s_free(local_id);
- return(NULL);
- }
-
- bytes = endByte - startByte;
- buffer = (char*)s_malloc(bytes);
-
- bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
-
- if (bytesRead != bytes)
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "retrieval error in file <%s>",filename);
-
- *errorCode = GDT_BadRange;
- if (tmpFileName) unlink( tmpFileName );
- s_free(local_id);
- return(NULL);
- }
-
- bufAny = makeAny(bytesRead,buffer);
-
- text = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
-
- /* the any and the buffer are freed by freeWAISSearchResponse() */
- s_fclose(file);
- if (tmpFileName) unlink( tmpFileName );
- *errorCode = GDT_NoError;
- s_free(local_id);
- return(text);
- }
-